Basic R

Cast dates

# Benchmark inputs: a date range expressed as ISO-8601 strings.
start_date <- "2017-01-01"
end_date <- "2019-12-31"
# Whole weeks between two dates given as strings; relies on difftime()
# coercing character input to date-times itself ("no cast" variant).
f1 <- function(d2, d1) {
  delta_weeks <- difftime(d2, d1, units = "weeks")
  invisible(floor(as.numeric(delta_weeks)))
}
# Whole weeks between two dates, explicitly cast with as.Date() first
# ("cast" variant -- measured faster than the implicit coercion above).
f2 <- function(d2, d1) {
  delta_weeks <- difftime(as.Date(d2), as.Date(d1), units = "weeks")
  invisible(floor(as.numeric(delta_weeks)))
}
# Benchmark: difftime() on raw character dates vs. pre-casting with
# as.Date(). Recorded output shows the explicit cast is ~3x faster.
m1<-microbenchmark(
  Nocast = f1(end_date, start_date),
  Cast = f2(end_date, start_date),
  times = 1000
)
print(m1)
## Unit: microseconds
##    expr     min       lq     mean   median      uq      max neval
##  Nocast 383.726 419.1025 439.1127 421.8325 431.842 4003.758  1000
##    Cast 126.918 138.2880 148.6586 139.5460 141.855 3783.847  1000
fbox_plot(m1, "microseconds")

Explicit vector length: vector("type", length) is faster than growing an empty vector created with c()

no_size <- function (n){
  # Deliberately slow baseline: grows x with c() on every iteration,
  # copying the whole vector each time (quadratic allocation cost).
  x <- c()
  # seq_len() instead of seq(): for n = 0, seq(0)/1:0 would count down
  # and run the loop; seq_len(0) is correctly empty.
  for (i in seq_len(n)) {
    x <- c(x, i)
  }
  # Return the built vector. The original ended on the for-loop, which
  # returns NULL, so the constructed result was silently discarded.
  x
}
explicit_size <- function (n){
  # Fast variant: preallocate the full-length integer vector once,
  # then fill by index -- no copies inside the loop.
  x <- vector("integer", n)
  # seq_len() is safe for n = 0 (empty sequence), unlike seq()/1:n.
  for (i in seq_len(n)) {
    x[i] <- i
  }
  # Return the built vector. The original ended on the for-loop and
  # therefore returned NULL instead of x.
  x
}
# Benchmark growing vs preallocated construction; recorded output
# shows growing with c() is ~100x slower at n = 1e4.
m3 <- microbenchmark(
  no_size = no_size(1e4),
  explicit_size = explicit_size(1e4),
  times = 10
)
print(m3)
## Unit: microseconds
##           expr       min        lq       mean   median        uq        max
##        no_size 71674.568 72094.832 78374.9607 72870.72 78238.637 105658.459
##  explicit_size   365.022   367.016   706.5107   382.76   409.285   3584.815
##  neval
##     10
##     10
fbox_plot(m3, "microseconds")

The which() function is slow in some simple situations

vector <- runif(1e8)
# Count elements above 0.5 by materializing their indices with which().
w1 <- function(x) {
  idx <- which(x > 0.5)
  invisible(length(idx))
}
# Count elements above 0.5 by summing the logical mask directly
# (no index vector is ever built).
w2 <- function(x) {
  mask <- x > 0.5
  invisible(sum(mask))
}

# Benchmark length(which(mask)) vs sum(mask) for counting matches.
m4 <- microbenchmark(
  which = w1(vector),
  nowhich = w2(vector),
  times = 10
)
print(m4)
## Unit: milliseconds
##     expr      min       lq     mean   median       uq      max neval
##    which 611.8463 614.1064 638.7722 615.8948 620.5165 735.8710    10
##  nowhich 216.6689 217.8526 233.3565 222.2152 222.9557 332.0488    10
fbox_plot(m4, "miliseconds")

Column operation is faster than row operation

# 10k-row table: integer id column `a`, uniform noise column `b`.
n <- 1e4
dt <- data.table(
  a = seq(n), b = runif(n)
)
# Row-first access: subset whole rows by the mask, then pull column a.
v1 <- function(dt) {
  matching_rows <- dt[dt$b > .5, ]
  invisible(mean(matching_rows$a))
}
# Column-first access: extract column a, then mask the vector.
v2 <- function(dt) {
  keep <- dt$b > .5
  invisible(mean(dt$a[keep]))
}
# Benchmark row-subset-then-column vs column-then-mask access.
m5 <- microbenchmark(
  row_operation = v1(dt),
  column_operation = v2(dt),
  times = 10
)
print(m5)
## Unit: microseconds
##              expr     min      lq      mean   median      uq      max neval
##     row_operation 214.801 221.204 1002.6255 233.5710 245.498 5472.771    10
##  column_operation  79.830  91.671  310.4688  99.7055 131.165 2143.754    10
fbox_plot(m5, "microseconds")

The seq() function is safer than 1:n

The function seq() avoids the descending-sequence bug of 1:x when x is zero: 1:0 yields c(1, 0), whereas seq_len(0) is empty.

num <- 1e7
# Colon-operator variant: 1:num counts down when num is 0, which is
# exactly the hazard this section contrasts against seq().
s1 <- function(num) {
  invisible(mean(1:num))
}
# seq() variant: same timings here, safer edge-case behavior.
s2 <- function(num) {
  invisible(mean(seq(num)))
}
# Benchmark 1:n vs seq(n); the recorded timings are essentially
# identical, so seq()'s safety costs nothing.
m6<-microbenchmark(
  noseq = s1(num),
  seq = s2(num),
  times = 30
)
print(m6)
## Unit: milliseconds
##   expr      min       lq     mean  median       uq      max neval
##  noseq 69.15290 69.28184 69.43360 69.3752 69.43299 71.15839    30
##    seq 69.19934 69.34238 69.55579 69.3735 69.51002 71.40295    30
fbox_plot(m6, "miliseconds")

paste0 is faster than glue

# 1e6-row table of ids and random letters for the string-building benchmark.
large_dataset <- data.table(
  id = 1:1000000,
  value = sample(letters, 1000000, replace = TRUE)
)
# Build the code column with paste0() (vectorized base concatenation).
a1 <- function(x) {
  invisible(mutate(x, code = paste0(id, "_", value)))
}
# Build the code column with glue() string interpolation.
a2 <- function(x) {
  invisible(mutate(x, code = glue("{id}_{value}")))
}
# Benchmark paste0() vs glue() building 1e6 "id_value" strings.
m7 <- microbenchmark(
  with_paste = a1(large_dataset),
  with_glue = a2(large_dataset),
  times = 20
)
print(m7)
## Unit: milliseconds
##        expr      min       lq     mean   median       uq       max neval
##  with_paste 558.4607 562.6357 601.5903 566.0397 569.4206 1274.7545    20
##   with_glue 587.0184 588.4760 593.4911 592.0364 595.1723  622.0061    20
fbox_plot(m7, "miliseconds")

for loop vs lapply

# Create a large list
big_list <- replicate(1e5, rnorm(10), simplify = FALSE)

# Benchmark lapply() vs an index for-loop. The loop arm grows `result`
# element by element; that growth is part of what is being measured.
m8 <- microbenchmark(
  lapply = lapply(big_list, mean),
  for_loop = {
    result <- list()
    for (i in seq_along(big_list)) {
      result[[i]] <- mean(big_list[[i]])
    }
  },
  times = 10
)

print(m8)
## Unit: milliseconds
##      expr      min       lq     mean   median       uq      max neval
##    lapply 321.5016 336.1584 349.1802 348.8675 354.9164 380.3998    10
##  for_loop 350.8892 366.8028 403.4082 368.8694 389.5892 648.0231    10
fbox_plot(m8, "miliseconds")

data.table package functions

Date vs IDate

# 100k-day table holding the same dates both as Date and as IDate
# (data.table's integer-backed date class), plus a value column.
dt <- data.table(
  Date = as.Date('2023-01-01') + 0:99999,
  iDate = as.IDate('2023-01-01') + 0:99999,
  Value = rnorm(100000)
)

# The same 10-day query window in both date representations.
nd1 <- as.Date('2023-01-01')
nd2 <- as.Date('2023-01-10')
id1 <- as.IDate('2023-01-01')
id2 <- as.IDate('2023-01-10')

# dplyr filter over the global `dt` with plain Date comparisons.
# NOTE(review): reads `dt` from the enclosing environment instead of
# taking it as an argument -- confirm that is intended.
date_between_operation <- function(nd1, nd2) {
  filter(dt, Date >= nd1 & Date <= nd2)
}
# data.table filter over the global `dt` using between() on the
# IDate column (same global-capture caveat as the Date variant).
idate_between_operation <- function(id1, id2) {
  dt[data.table::between(iDate, id1, id2)]
}

# Benchmark Date vs IDate range filtering; recorded output shows the
# IDate/data.table path roughly 3x faster.
m9 <- microbenchmark(
  Date = date_between_operation(nd1, nd2),
  iDate = idate_between_operation(id1, id2),
  times = 200L
)
print(m9)
## Unit: microseconds
##   expr      min        lq      mean   median       uq      max neval
##   Date 1367.675 1608.0435 1966.2702 1982.592 2206.587 4162.645   200
##  iDate  513.229  586.5605  727.7369  652.975  850.413 2395.735   200
# Fixed unit label: the recorded timings are microseconds; the
# original call passed "miliseconds".
fbox_plot(m9, "microseconds")

Base R switch vs Dplyr case_when (for simple tasks)

# Map a single letter code to a fruit name; unmatched codes fall
# through to the trailing "default".
switch_function <- function(x) {
  switch(
    x,
    a = "apple",
    b = "banana",
    c = "cherry",
    "default"
  )
}
# Vectorized letter-to-fruit mapping via dplyr::case_when(); the
# final TRUE clause supplies the default for unmatched codes.
case_when_function <- function(x) {
  case_when(
    x == "a" ~ "apple",
    x == "b" ~ "banana",
    x == "c" ~ "cherry",
    TRUE ~ "default"
  )
}
# Create a vector of test values
test_values <- sample(c("a", "b", "c", "d"), 1000, replace = TRUE)
# Benchmark scalar switch() vs case_when() inside sapply(); case_when
# pays its vectorized setup cost on every single-element call here.
m10 <- microbenchmark(
  switch = sapply(test_values, switch_function),
  case_when = sapply(test_values, case_when_function),
  times = 200L
)
print(m10)
## Unit: microseconds
##       expr        min         lq        mean      median          uq        max
##     switch    602.806    659.527    685.6361    668.0125    685.9265   2206.742
##  case_when 218507.740 232101.599 238316.3616 238497.3895 241205.5430 417453.563
##  neval
##    200
##    200
fbox_plot(m10, "microseconds")

data.table fcase vs Dplyr case_when

# 1e6-row table with ids and integer values in 1..100 for bucketing.
set.seed(123)
n <- 1e6
data <- data.table(
  id = seq(n),
  value = sample(seq(100), n, replace = TRUE)
)

# Bucket `value` into Low/Medium/High with dplyr::case_when().
casewhenf <- function(data) {
  labelled <- mutate(
    data,
    category = case_when(
      value <= 20 ~ "Low",
      value <= 70 ~ "Medium",
      value > 70 ~ "High"
    )
  )
  invisible(labelled)
}
# Same Low/Medium/High bucketing with data.table::fcase().
fcasef <- function(data) {
  labelled <- mutate(
    data,
    category = fcase(
      value <= 20, "Low",
      value <= 70, "Medium",
      value > 70, "High"
    )
  )
  invisible(labelled)
}
# Benchmark case_when() vs fcase() bucketing on 1e6 rows.
m11 <- microbenchmark(
  case_when = casewhenf(data),
  fcase = fcasef(data),
  times = 20
)
print(m11)
## Unit: milliseconds
##       expr      min       lq     mean   median       uq      max neval
##  case_when 54.28990 59.28602 63.74796 63.49637 67.93778 72.99883    20
##      fcase 19.82819 20.79428 22.17237 21.68749 22.92994 27.45743    20
fbox_plot(m11, "miliseconds")

data.table fcoalesce vs tidyr replace_na

# 1e6-row table where both value columns contain NAs to be replaced.
set.seed(123)
DT <- data.table(
  ID = 1:1e6,
  Value1 = sample(c(NA, 1:100), 1e6, replace = TRUE),
  Value2 = sample(c(NA, 101:200), 1e6, replace = TRUE)
)

# Define the functions
# Replace NAs in both value columns with 0 via tidyr::replace_na(),
# then coerce the result back to a data.table.
replace_na_f <- function(data) {
  filled <- mutate(
    data,
    Value1 = replace_na(Value1, 0),
    Value2 = replace_na(Value2, 0)
  )
  invisible(as.data.table(filled))
}
# Replace NAs with 0L using data.table::fcoalesce() (type-stable,
# no coercion step required afterwards).
fcoalesce_f <- function(data) {
  filled <- mutate(
    data,
    Value1 = fcoalesce(Value1, 0L),
    Value2 = fcoalesce(Value2, 0L)
  )
  invisible(filled)
}
# Benchmark replace_na() vs fcoalesce() NA replacement.
m12 <- microbenchmark(
  treplace_na = replace_na_f(DT),
  tfcoalesce = fcoalesce_f(DT),
  times = 20
)
print(m12)
## Unit: milliseconds
##         expr      min       lq     mean   median        uq       max neval
##  treplace_na 7.169631 7.391961 8.672590 7.811705 10.205576 12.617687    20
##   tfcoalesce 1.528014 1.602102 2.165468 2.045485  2.444476  4.726348    20
fbox_plot(m12, "miliseconds")

data.table notation vs dplyr notation

# Small table of dotted strings for the negative-filter benchmark.
dt <- data.table(field_name = c("argentina.blue.man.watch", 
                                "brazil.red.woman.shoes", 
                                "canada.green.kid.hat", 
                                "denmark.red.man.shirt"))

# Filter rows where 'field_name' does not contain 'red'
# data.table filter: keep rows whose field_name does not contain "red".
dtnot <- function(data) {
  invisible(data[!grepl("red", field_name)])
}
# Base-R filter via plain logical row indexing (no pipe gymnastics).
anonymousnot <- function(data) {
  keep <- !grepl("red", data$field_name)
  invisible(data[keep, ])
}
# dplyr filter: drop rows whose field_name matches "red".
dplyrnot <- function(data) {
  invisible(filter(data, !grepl("red", field_name)))
}

# Benchmark the three negative-filter styles on the 4-row table.
m13 <- microbenchmark(
  anonymous_not = anonymousnot(dt),
  data_table_not = dtnot(dt),
  dplyr_not = dplyrnot(dt),
  times = 100
)
print(m13)
## Unit: microseconds
##            expr     min       lq     mean   median       uq      max neval
##   anonymous_not 105.958 111.8695 157.1831 122.0885 144.7060 3107.734   100
##  data_table_not 103.183 108.4070 142.3854 118.9920 139.6160 1898.757   100
##       dplyr_not 684.649 713.8085 759.9004 726.6820 744.4805 2975.568   100
fbox_plot(m13, "microseconds")

data.table melt vs tidyr pivot_longer

# 100k-row wide table with four numeric var columns to reshape long.
large_data <- data.table(
  id = 1:100000,
  var1 = rnorm(100000),
  var2 = rnorm(100000),
  var3 = rnorm(100000),
  var4 = rnorm(100000)
)
# Benchmarking
# tidyr::pivot_longer() vs data.table::melt() for wide-to-long reshape.
m14 <- microbenchmark(
  tidyr_pivot_longer = {
    long_data_tidyr <- pivot_longer(large_data, cols = starts_with("var"), 
                                    names_to = "variable", values_to = "value")
  },
  data_table_melt = {
    long_data_dt <- melt(large_data, id.vars = "id", variable.name = "variable", 
                         value.name = "value")
  },
  times = 10
)

print(m14)
## Unit: microseconds
##                expr      min       lq      mean    median       uq       max
##  tidyr_pivot_longer 6448.133 6574.229 8775.3237 6812.5635 6962.845 26783.203
##     data_table_melt  444.881  474.647  555.0589  529.7945  635.467   727.048
##  neval
##     10
##     10
fbox_plot(m14, "microseconds")

data.table CJ vs tidyr expand_grid

# Two 1000-element vectors; their cross join has 1e6 rows.
vec1 <- seq(1000)
vec2 <- seq(1000)

# Define functions to be benchmarked
# Cross every vec1 value with every vec2 value via tidyr::expand_grid().
# Last expression is returned; explicit return() is reserved for early
# exits in this file's style.
expand_grid_func <- function() {
  expand_grid(vec1, vec2)
}

# Cross join of the two global vectors via data.table::CJ().
CJ_func <- function() {
  CJ(vec1, vec2)
}

# Perform benchmarking
# expand_grid() vs CJ() building the 1e6-row cross join.
m15 <- microbenchmark(
  expand_grid = expand_grid_func(),
  CJ = CJ_func(),
  times = 10
)

print(m15)
## Unit: microseconds
##         expr      min       lq      mean    median       uq      max neval
##  expand_grid 2197.134 2213.805 2539.9229 2289.1205 2328.710 3829.502    10
##           CJ  475.828  486.258  649.3227  489.0785  509.942 1776.739    10
fbox_plot(m15, "microseconds")

data.table rbindlist vs R rbind

# Sample data: 50 data.tables of 1e4 rows each for the bind benchmark.
# `<-` for top-level assignment, and TRUE/FALSE literals instead of the
# reassignable aliases T/F.
size <- 1e4
set.seed(44)
df_list <- replicate(50, data.table(id = sample(seq(size), size, replace = TRUE),
                                    value = rnorm(size)), simplify = FALSE)

# Stack a list of data.frames with base do.call() + rbind().
simple_bind <- function(list_of_dfs) {
  do.call(what = rbind, args = list_of_dfs)
}

# Thin wrapper over dplyr::bind_rows() for the bind benchmark.
dplyr_bind <- function(list_of_dfs){
  bind_rows(list_of_dfs)
}

dt_bind <- function(list_of_dfs){
  # fill = FALSE (literal, not the reassignable alias F): all the
  # sample tables share identical columns, so no filling is needed.
  rbindlist(list_of_dfs, fill = FALSE)
}

# Benchmark both methods
# rbindlist() vs do.call(rbind) vs bind_rows() over 50 tables.
m16 <- microbenchmark(
  dt_ver = dt_bind(df_list),
  simple = simple_bind(df_list),
  dplyr_ver = dplyr_bind(df_list),
  times = 30
)

print(m16)
## Unit: microseconds
##       expr       min        lq       mean     median        uq       max neval
##     dt_ver   468.646   490.977   597.0125   546.6915   569.704  2103.399    30
##     simple   471.561   510.464   608.2575   547.9030   617.403  2015.835    30
##  dplyr_ver 10344.651 10570.302 10759.8372 10678.8295 10853.681 11991.677    30
fbox_plot(m16, "microseconds")

stringr word vs tidyr separate vs data.table tstrsplit

set.seed(123)
n <- 1e4
# One dotted 5-word string replicated to n rows. FALSE literals
# instead of the reassignable alias F.
df <- data.table(text = paste("word1", "word2", "word3", "word4", "word5", sep = "."), stringsAsFactors = FALSE)
df <- df[rep(1, n), , drop = FALSE]

# Using tidyr::separate
separate_words <- function() {
  # Split the global `df` on "." into five columns, then drop the
  # unwanted ones. remove = FALSE (literal, not the alias F) keeps the
  # original text column.
  df |> 
    separate(text, into = c("w1", "w2", "w3", "w4", "w5"), sep = "\\.", remove = FALSE) |> 
    select(-c(w1, w2, w4))
}

# Using stringr::word
# Extract the 3rd and 5th dot-separated words with stringr::word();
# fixed(".") treats the separator literally rather than as a regex.
stringr_words <- function() {
  mutate(
    df,
    w3 = word(text, 3, sep = fixed(".")),
    w5 = word(text, 5, sep = fixed("."))
  )
}

datatable_words <- function() {
  # Split on a literal "." and keep pieces 3 and 5 as new columns.
  # NOTE(review): `:=` updates the global df BY REFERENCE, so unlike
  # the other two approaches this mutates df on every benchmark
  # iteration -- confirm the asymmetry is intended.
  df |> _[, c("w3", "w5") := tstrsplit(text, "\\.")[c(3, 5)]]
}

# Benchmark the three word-extraction approaches.
m17 <- microbenchmark(
  separate = separate_words(),
  stringr = stringr_words(),
  dt = datatable_words(),
  times = 10
)

print(m17)
## Unit: milliseconds
##      expr       min        lq      mean    median        uq       max neval
##  separate  77.55017  78.71027  83.93366  80.09989  91.54156  95.72738    10
##   stringr 176.15070 178.83882 182.15315 181.00322 184.41627 194.52496    10
##        dt  12.80430  12.97534  13.40331  13.02553  13.65529  15.74265    10
fbox_plot(m17, "miliseconds")

data.table na_omit vs dplyr drop_na

# Sample data: 1e6 rows where y and z contain NAs to be dropped.
set.seed(123)
n <- 1e6
df <- data.table(
  x = rnorm(n),
  y = sample(c(NA, 1:100), n, replace = TRUE),
  z = sample(c(NA, letters), n, replace = TRUE),
  # FALSE literal instead of the reassignable alias F.
  stringsAsFactors = FALSE
)

# Benchmark both methods
m18 <- microbenchmark(
  dplyr_drop_na = {
    df |> drop_na()
  },
  data_table_na_omit = {
    # BUG FIX: the original ran na.omit() on `dt` -- a tiny leftover
    # table from an earlier section -- instead of the 1e6-row `df`,
    # which is why the recorded na.omit timings below are implausibly
    # small. Both arms now operate on the same data.
    df |> na.omit()
  },
  times = 10
)

print(m18)
# NOTE(review): the recorded output below predates the dt -> df fix
# and should be regenerated.
## Unit: microseconds
##                expr      min       lq      mean    median       uq      max
##       dplyr_drop_na 9678.687 9713.142 9817.8071 9726.5720 9750.902 10600.92
##  data_table_na_omit    9.097   10.149   52.5112   61.7805   65.682   165.60
##  neval
##     10
##     10
fbox_plot(m18, "microseconds")

Parallel processing

lapply vs parallel mclapply

# Sample data: 100 tables to join plus one lookup table.
# `<-` for top-level assignment; TRUE/FALSE literals instead of T/F.
set.seed(123)

size <- 1e4
n_cores <- parallelly::availableCores()

df_list <- replicate(100, data.table(id = sample(seq(size), size, replace = TRUE),
                                    value = rnorm(size)), simplify = FALSE)
extra_df <- data.table(id = sample(seq(size), size, replace = TRUE), 
                       extra_value = runif(size))

# Sequential join
sequential_join <- function() {
  # Merge each table in df_list with extra_df on id, one after the
  # other. TRUE literal instead of the reassignable alias T.
  lapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = TRUE)
  })
}

# Parallel join using mclapply
parallel_join <- function() {
  # Same merge fanned out over n_cores forked workers. TRUE literals
  # instead of the reassignable alias T.
  mclapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = TRUE)
  }, mc.cores = n_cores, mc.silent = TRUE, mc.cleanup = TRUE)
}

# Benchmark both methods
# Sequential lapply() vs forked mclapply() joins.
m19 <- microbenchmark(
  sequential = sequential_join(),
  parallel = parallel_join(),
  times = 10
)

print(m19)
## Unit: milliseconds
##        expr      min       lq     mean   median       uq      max neval
##  sequential 301.5262 320.5787 357.6035 337.8800 387.5695 495.8454    10
##    parallel 155.6060 182.4350 190.2782 193.4453 198.0251 206.7770    10
fbox_plot(m19, "miliseconds")

dtplyr

dtplyr is another alternative (you need to install this package).

# 1e7-row table with two grouping columns and two numeric columns
# for the dplyr vs dtplyr pipeline benchmark.
set.seed(123)
n <- 1e7
df <- data.table(
  group1 = sample(LETTERS[1:10], n, replace = TRUE),
  group2 = sample(letters[1:5], n, replace = TRUE),
  value1 = rnorm(n),
  value2 = runif(n, 1, 100)
)

# Benchmark a filter/mutate/summarize pipeline run directly with dplyr
# on a data.table vs routed through dtplyr's lazy_dt() translation.
m21 <- microbenchmark(
  basic_way = {
    # Renamed from `dplyr` (shadowed the dplyr package name).
    res_dplyr <- df |> 
      filter(value1 > 0) |> 
      mutate(ratio = value1 / value2) |> 
      summarize(
        mean_val1 = mean(value1),
        sd_val1 = sd(value1),
        median_val2 = median(value2),
        max_ratio = max(ratio), .by = c("group1", "group2")) |> 
      as.data.table()
  },
  dtplyr_way = {
    # Was `dtplyr = ...`: `=` assignment plus shadowing the dtplyr
    # package name; now `<-` with a distinct name.
    res_dtplyr <- df |> 
      lazy_dt() |> 
      filter(value1 > 0) |> 
      mutate(ratio = value1 / value2) |> 
      summarize(
        mean_val1 = mean(value1),
        sd_val1 = sd(value1),
        median_val2 = median(value2),
        max_ratio = max(ratio), .by = c("group1", "group2")) |> 
      as.data.table()
  },
  times = 5
)

print(m21)
## Unit: milliseconds
##        expr      min       lq     mean   median       uq      max neval
##   basic_way 581.8151 594.4181 625.3249 612.0199 616.2274 722.1441     5
##  dtplyr_way 374.1814 408.1085 420.1764 415.0208 419.4515 484.1197     5
fbox_plot(m21, "miliseconds")

Duckdb

DuckDB files vs parquet

Parquet files may take longer to read if you partition by day. Consider partitioning by year instead, or try DuckDB. Another advantage of DuckDB is lower memory consumption, since you can wrangle the data with SQL statements.

Note: DuckDB requires opening a connection; consider the read_only parameter if you only want to read data. Don't forget to close the connection.

# partitioned_by_day <- "/conf/posit_azure_logs/data/merge_uip_data_test"
# Paths to the year-partitioned parquet dataset and the DuckDB file.
partitioned_by_year <- "/conf/posit_azure_logs/test_290825/data/merge_uip_data_test"
my_duckdb <- "/conf/posit_azure_logs/test_290825/data/my_db.duckdb"

# Lazily scan the parquet dataset with arrow::open_dataset(), derive
# the memory/session summary columns, then collect into a data.table.
# NOTE(review): the column list and the 160/256 per-node factors are
# site-specific -- confirm against the dataset schema.
with_parquet <- function(folder_path){
  data_1 <- open_dataset(file.path(folder_path)) |>
    select(
      ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
      ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
      ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
      ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
      ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
      ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
      ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
      ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
      ALL_WIP_CP_node_total, ALL_WIP_BP_node_total
    ) |>
    mutate(
      # Per-pool node memory (node counts scaled by per-node capacity).
      computepool_node_mem = ALL_WIP_CP_node_total * (160 * 1024),
      bigpool_node_mem = ALL_WIP_BP_node_total * (256 * 1024),
      ALL_WIP_day_session = ALL_WIP_CP_day_session + ALL_WIP_BP_day_session,
      ALL_WIP_night_session = ALL_WIP_CP_night_session + ALL_WIP_BP_night_session,
      ALL_WIP_node_total = ALL_WIP_CP_node_total + ALL_WIP_BP_node_total,
      total_mem_limit = ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit,
      total_mem_request = ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request,
      total_mem_max = ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max,
      total_node_mem = computepool_node_mem + bigpool_node_mem,
      # Guard against division by zero when no nodes are present.
      average_session_per_node = ifelse(ALL_WIP_node_total != 0,
                                         (ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total, 0)
    ) |>
    arrange(ALL_WIP_CP_day_session, ALL_WIP_CP_night_session, ALL_WIP_BP_day_session) |> 
    collect() |>
    as.data.table()
}

# Run the summary query against the DuckDB file and return the result
# as a data.table. The SQL mirrors the arrow pipeline in with_parquet().
with_duckfile <- function(my_path){
  # Read-only connection; on.exit() guarantees the connection is closed
  # even if the query fails. The original only disconnected on the
  # success path and -- because dbDisconnect() was the last expression --
  # returned its result instead of the queried data.
  my_connection <- dbConnect(duckdb::duckdb(), dbdir = my_path, read_only = TRUE)
  on.exit(dbDisconnect(my_connection, shutdown = TRUE), add = TRUE)
  data_2 <- dbGetQuery(
    my_connection,
    statement = "select
        ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
        ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
        ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
        ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
        ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
        ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
        ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
        ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
        ALL_WIP_CP_node_total, ALL_WIP_BP_node_total,
        ALL_WIP_CP_node_total * 160 * 1024 as computepool_node_mem,
        ALL_WIP_BP_node_total * 256 * 1024 as bigpool_node_mem,
        ALL_WIP_CP_day_session + ALL_WIP_BP_day_session as ALL_WIP_day_session,
        ALL_WIP_CP_night_session + ALL_WIP_BP_night_session as ALL_WIP_night_session,
        ALL_WIP_CP_node_total + ALL_WIP_BP_node_total as ALL_WIP_node_total,
        ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit as total_mem_limit,
        ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request as total_mem_request,
        ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max as total_mem_max,
        computepool_node_mem + bigpool_node_mem as total_node_mem,
        CASE 
          WHEN ALL_WIP_node_total != 0 THEN (ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total
          ELSE 0
        END AS average_session_per_node
      from mytable order by ALL_WIP_CP_day_session, ALL_WIP_CP_night_session, ALL_WIP_BP_day_session",
    immediate = TRUE) |> 
    as.data.table()
  # Return the data explicitly (disconnect happens via on.exit()).
  data_2
}

# Benchmark DuckDB-file query vs year-partitioned parquet scan.
# NOTE(review): times = 2 gives very noisy estimates.
m22 <- microbenchmark(
  duckdb_file = with_duckfile(my_duckdb),
  parquet_by_year = with_parquet(partitioned_by_year),
  times = 2
)

print(m22)
## Unit: milliseconds
##             expr      min       lq     mean   median       uq      max neval
##      duckdb_file 496.7932 496.7932 606.4560 606.4560 716.1188 716.1188     2
##  parquet_by_year 576.0924 576.0924 703.0102 703.0102 829.9280 829.9280     2
fbox_plot(m22, "miliseconds")